XYZ wants to invest in the cab industry as per its Go-To-Market strategy. XYZ needs to identify the right company to invest in.
Time period of data - 31/01/2016 - 31/12/2018
Cab_Data - Transaction for 2 cab companies
Customer_ID - Customer's demographic details
Transaction_ID - Transaction to Customer Mapping and Payment mode
City - US city population and number of cab users
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sb
%matplotlib inline
# Load the four raw CSV tables in one pass. The order of the paths below
# decides which file ends up in which frame.
cabData, customerData, transactionData, cityData = (
    pd.read_csv(csv_path)
    for csv_path in (
        "DataSets/Cab_Data.csv",
        "DataSets/Customer_ID.csv",
        "DataSets/Transaction_ID.csv",
        "DataSets/City.csv",
    )
)
# Preview the first rows of each raw table.
cabData.head()
customerData.head()
transactionData.head()
cityData.head()

# Column dtypes and non-null counts for each table. DataFrame.info() writes
# its report itself and returns None, so it is called directly: wrapping it in
# print() only appended a stray "None" line after every report.
cabData.info()
customerData.info()
transactionData.info()
cityData.info()
# "Date of Travel" holds integer day counts.
# NOTE(review): these are presumably Excel serial dates (the previous code
# offset them from 1900-01-01) - confirm against the data source.
# In Excel's 1900 date system serial 1 is 1900-01-01 and the non-existent
# 1900-02-29 is counted as a day, so the matching day-zero origin is
# 1899-12-30. Adding the serial to 1900-01-01 puts every date two days late.
cabData["Travel Date"] = pd.to_datetime(
    cabData["Date of Travel"].astype(int), unit="D", origin="1899-12-30"
)
cabData.head()
# Build one flat table out of the four sources:
#   customers + transactions   joined on "Customer ID"
#   cab trips + city figures   joined on "City"
#   and finally both halves    joined on "Transaction ID"
customerMerge = customerData.merge(transactionData, on="Customer ID")
cityMerge = cabData.merge(cityData, on="City")
df = cityMerge.merge(customerMerge, on="Transaction ID")
# To keep a copy of the merged table on disk: df.to_csv('completeData.csv')
df.head()
# Number of missing values in each column of the merged table.
df.isna().sum()
# Dtypes and non-null counts of the merged table.
df.info()
# Pairwise correlations. The merged table also carries text columns such as
# "City" and "Company"; pandas >= 2.0 raises on those instead of silently
# skipping them, so restrict the frame to its numeric columns first.
df.select_dtypes(include="number").corr()
plt.figure(figsize=(40, 30))
# Name the column through x= rather than passing a Series positionally:
# recent seaborn releases take `data` as the first positional parameter, so a
# positional Series combined with data=df is rejected.
ax = sb.countplot(x="City", hue="Company", data=df, palette=["#FFC0CB", "#FFFAA0"])
ax.set_title("Total Rides by City", fontsize=20)
# Write each bar's ride count just below its top edge, centred on the bar.
for p in ax.patches:
    height = p.get_height()
    # Skip placeholder patches with zero or NaN height; they carry no count.
    if not height > 0:
        continue
    ax.annotate(
        f"\n{height:.0f}",
        (p.get_x() + p.get_width() / 2, height),
        ha="center",
        va="top",
        color="black",
        size=14,
    )
plt.show()
plt.figure(figsize=(15, 15))
plt.title("Distribution of Total Rides by City")
# Series.plot returns the Axes it drew on; use that handle to blank the
# automatic y-label. Calling plt.axes() with no arguments adds a NEW
# full-window Axes on top of the pie instead of returning the current one.
pie_ax = df["City"].value_counts().plot(kind="pie")
pie_ax.set_ylabel("")
# Profit of each row: the price charged minus the cost of the trip.
df["Profit"] = df["Price Charged"].sub(df["Cost of Trip"])
df.head()
# Per-company mean of every numeric column. numeric_only=True is required
# because the frame also contains text columns (e.g. "City"); pandas >= 2.0
# raises on them instead of dropping them silently.
profitData = df.groupby("Company").mean(numeric_only=True)
profitData.head()
# Non-null value counts of every column, per company.
companyData = df.groupby("Company").count()
companyData.head()
companyData.info()

# Distinct customers per company. The chart is titled "Total Users", but
# companyData["Users"] is only the number of rows with a value in the "Users"
# column (presumably the per-city figure from City.csv - confirm), i.e. a ride
# count in which repeat customers are counted once per ride.
usersByCompany = df.groupby("Company")["Customer ID"].nunique()

plt.figure(figsize=(10, 10))
plt.title("Total Users by Company")
plt.pie(
    usersByCompany,
    labels=usersByCompany.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=["#FFC0CB", "#FFFAA0"],
)
plt.show()
# Line plot of the row-level "Profit" and "KM Travelled" values, with the
# "Company" column supplied as x.
# NOTE(review): x is categorical and the rows are not aggregated, so this
# chart may be hard to read - consider plotting per-company aggregates.
df.plot.line(x="Company", y=["Profit", "KM Travelled"])
plt.show()
# Split the merged table into one frame per company.
pinkCab = df.loc[df["Company"].eq("Pink Cab")]
pinkCab.head()
yellowCab = df.loc[df["Company"].eq("Yellow Cab")]
yellowCab.head()
# Profit against distance, one panel per company. Nothing above sorts the rows
# by distance, so a line plot would join consecutive rows with zig-zag
# segments that bury the data; draw unconnected points instead.
figure, (ax1, ax2) = plt.subplots(1, 2)
for panel, trips, company in ((ax1, pinkCab, "Pink Cab"), (ax2, yellowCab, "Yellow Cab")):
    panel.scatter(trips["KM Travelled"], trips["Profit"], s=2)
    panel.set_title(company)
    panel.set_xlabel("KM Travelled")
    panel.set_ylabel("Profit")
plt.show()
# Group the Pink Cab rows by distance and look at the first rows of each group.
dummy = pinkCab.groupby(by="KM Travelled")
dummy.head(n=5)
# Bucket the trips into 10 km wide, left-closed distance ranges
# ([0, 10), [10, 20), ... [40, 50)) and total the profit per range and company.
label = ["{0} - {1}".format(i, i + 9) for i in range(0, 50, 10)]
df["KM Range"] = pd.cut(df['KM Travelled'], range(0, 55, 10), right=False, labels=label)
pivot_KM_profit = df.pivot_table(index=["KM Range"], columns='Company', values='Profit', aggfunc='sum')
pivot_KM_profit.fillna(0, inplace=True)
ax = pivot_KM_profit.plot.barh(stacked=True, figsize=(15, 10), color=["#FFC0CB", "#FFFAA0"])

# Label every bar segment with that company's share of the range's total
# profit. Previously the label strings were built but never drawn, and the
# "hide small values" test compared a company name with 0, so it never fired.
# ax.containers holds one group of bars per plotted column, in index order.
row_totals = pivot_KM_profit.sum(axis=1)
labels = []
for company, bars in zip(pivot_KM_profit.columns, ax.containers):
    for km_range, bar in zip(pivot_KM_profit.index, bars):
        total = row_totals.loc[km_range]
        if total == 0:
            # An empty range has no meaningful share (and would divide by zero).
            share_text = ""
        else:
            share = pivot_KM_profit.loc[km_range, company] / total * 100
            share_text = str(round(share, 1)) + "% "
        labels.append(share_text)
        if share_text and bar.get_width() != 0:
            ax.text(
                bar.get_x() + bar.get_width() / 2,
                bar.get_y() + bar.get_height() / 2,
                share_text,
                ha="center",
                va="center",
            )
plt.title("PROFIT ANALYSIS PER KM ")
plt.show()
# Bucket "Cost of Trip" into 50-wide, left-closed bands: [0, 50), [50, 100), ...
label_Cost = [f"{low} - {low + 49}" for low in range(0, 700, 50)]
df["Cost of Trip Grouped"] = pd.cut(
    df["Cost of Trip"],
    bins=range(0, 710, 50),
    labels=label_Cost,
    right=False,
)
df.head()

# "Price Charged" aggregated per cost band and company with pivot_table's
# default aggregate (the mean); missing cells are filled with 0.
# NOTE(review): the bars below stack the two companies' means on top of each
# other - confirm that a stacked layout is what is wanted for a comparison.
pivotCostTripPrice = df.pivot_table(
    values="Price Charged",
    index=["Cost of Trip Grouped"],
    columns="Company",
).fillna(0)
ax = pivotCostTripPrice.plot(
    kind="barh",
    stacked=True,
    figsize=(15, 10),
    color=["#FFC0CB", "#FFFAA0"],
)
plt.title("Cost of Trip and Price Charged comparison")
plt.show()
yellowCab.head()
# Cost of trip per company, split by gender. catplot is a figure-level
# function and builds its own figure.
sb.catplot(x="Company", y="Cost of Trip", hue="Gender", kind="bar", data=df)

# scatterplot is axes-level and draws on whatever Axes is current. Without a
# fresh figure, the two scatters below land on the bar chart above (and on
# each other) when this runs as one script, so give each its own figure.
plt.figure()
sb.scatterplot(x="Price Charged", y="Cost of Trip", hue="Company", data=df, palette=["#FFC0CB", "#FFFAA0"])

plt.figure()
sb.scatterplot(x="Price Charged", y="Cost of Trip", data=yellowCab, color="#FFFAA0")
# Pink Cab: cost of trip against price charged, with dense fixed ticks.
# Create the figure once at its final size. A separate plt.figure(...) call
# before plt.subplots() only leaves an extra, empty figure behind.
fig, ax = plt.subplots(figsize=(25, 15))
ax.xaxis.set_ticks(np.arange(50, 1700, 50))
ax.yaxis.set_ticks(np.arange(100, 600, 20))
# Pass the Axes explicitly so the points cannot land on another figure.
sb.scatterplot(x="Price Charged", y="Cost of Trip", data=pinkCab, s=15, color="pink", ax=ax)
# Both companies: cost of trip against price charged, coloured by company.
# Create the figure once at its final size. A separate plt.figure(...) call
# before plt.subplots() only leaves an extra, empty figure behind.
fig, ax = plt.subplots(figsize=(15, 15))
# Pass the Axes explicitly so the points cannot land on another figure.
sb.scatterplot(
    x="Price Charged",
    y="Cost of Trip",
    hue="Company",
    data=df,
    palette=["#FFC0CB", "#FFFAA0"],
    ax=ax,
)
ax.set(ylim=(1, 800))
ax.xaxis.set_ticks(np.arange(100, 2000, 80))
ax.yaxis.set_ticks(np.arange(100, 800, 20))
# "Price Charged" per cost band, one bar per company (barplot's default
# estimator), on a 15 x 15 inch figure.
fig, ax = plt.subplots(figsize=(15, 15))
sb.barplot(
    data=df,
    x="Cost of Trip Grouped",
    y="Price Charged",
    hue="Company",
    palette=["#FFC0CB", "#FFFAA0"],
    ax=ax,
)
# "KM Travelled" per cost band, one bar per company (barplot's default
# estimator), on a 15 x 15 inch figure with a y tick every 5 units up to 45.
fig, ax = plt.subplots(figsize=(15, 15))
sb.barplot(
    data=df,
    x="Cost of Trip Grouped",
    y="KM Travelled",
    hue="Company",
    palette=["#FFC0CB", "#FFFAA0"],
    ax=ax,
)
ax.set_yticks(np.arange(0, 50, 5))
# Pairwise relationships between the columns, coloured by company. pairplot is
# a figure-level function and builds its own grid of Axes.
sb.pairplot(df, hue="Company")

# regplot is axes-level and draws on whatever Axes is current. Without a fresh
# figure, all four fits pile onto a single Axes (after the pairplot above,
# likely one of its panels), mixing "Profit" and "Cost of Trip" on one y scale.
# Draw each company/metric fit on its own figure instead.
for metric in ("Profit", "Cost of Trip"):
    for trips, shade in ((yellowCab, "gold"), (pinkCab, "pink")):
        plt.figure()
        ax = sb.regplot(x="Price Charged", y=metric, data=trips, color=shade)